import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
# plotly to be shown in notebook
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
# Annotations in visual studio
import plotly.io as pio
pio.renderers.default = "notebook"
df = pd.read_pickle('output/crowdfunding_datapreprocessed.pkl')
df
| Funded | Requested | Difference amount | Covered amount | Funded allocation | Activity | Sector | Use | Country code | Country | Region | Currency | Term in months | Lenders | Borrower genders | Repayment interval | Credit types | Population | Borrowers | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 300 | 300 | 0.0 | 100.0 | 0.130725 | Fruits & Vegetables | Food | To buy seasonal, fresh fruits to sell. | PK | Pakistan | Lahore | PKR | 12 | 12 | female | irregular | bronze | 2.294890e+05 | 1.0 |
| 1 | 575 | 575 | 0.0 | 100.0 | 0.250557 | Rickshaw | Transportation | to repair and maintain the auto rickshaw used ... | PK | Pakistan | Lahore | PKR | 11 | 14 | female, female | irregular | bronze | 2.294890e+05 | 2.0 |
| 2 | 150 | 150 | 0.0 | 100.0 | 0.010664 | Transportation | Transportation | To repair their old cycle-van and buy another ... | IN | India | Maynaguri | INR | 43 | 6 | female | bullet | bronze | 1.406632e+06 | 1.0 |
| 3 | 200 | 200 | 0.0 | 100.0 | 0.087150 | Embroidery | Arts | to purchase an embroidery machine and a variet... | PK | Pakistan | Lahore | PKR | 11 | 8 | female | irregular | bronze | 2.294890e+05 | 1.0 |
| 4 | 400 | 400 | 0.0 | 100.0 | 0.174300 | Milk Sales | Food | to purchase one buffalo. | PK | Pakistan | Abdul Hakeem | PKR | 14 | 16 | female | monthly | bronze | 2.294890e+05 | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 646828 | 0 | 25 | 100.0 | 0.0 | 0.000000 | Livestock | Agriculture | [True, u'para compara: cemento, arenya y ladri... | PY | Paraguay | Concepción | USD | 13 | 0 | female | monthly | bronze | 7.305842e+03 | 1.0 |
| 646829 | 0 | 25 | 100.0 | 0.0 | 0.000000 | Livestock | Agriculture | Reviewed loan use in english. | PK | Pakistan | Attock | PKR | 13 | 0 | female | monthly | bronze | 2.294890e+05 | 1.0 |
| 646830 | 0 | 125 | 100.0 | 0.0 | 0.000000 | Livestock | Agriculture | Pretend the flagged issue was addressed by KC. | MX | Mexico | Iztacalco | MXN | 13 | 0 | female, female | monthly | bronze | 1.315628e+05 | 2.0 |
| 646831 | 0 | 875 | 100.0 | 0.0 | 0.000000 | Livestock | Agriculture | Translated loan use to english. | BO | Bolivia | La Paz | BOB | 13 | 0 | female, female | monthly | bronze | 1.199265e+04 | 2.0 |
| 646832 | 0 | 250 | 100.0 | 0.0 | 0.000000 | Livestock | Agriculture | Reviewed loan use in english. | GH | Ghana | Dansoman | GHS | 13 | 0 | female | monthly | bronze | 3.239545e+04 | 1.0 |
646833 rows × 19 columns
df.columns
Index(['Funded', 'Requested', 'Difference amount', 'Covered amount',
'Funded allocation', 'Activity', 'Sector', 'Use', 'Country code',
'Country', 'Region', 'Currency', 'Term in months', 'Lenders',
'Borrower genders', 'Repayment interval', 'Credit types', 'Population',
'Borrowers'],
dtype='object')
# sns.pairplot(df)
corr = df.corr(method='pearson')
corr
| Funded | Requested | Difference amount | Covered amount | Funded allocation | Term in months | Lenders | Population | Borrowers | |
|---|---|---|---|---|---|---|---|---|---|
| Funded | 1.000000 | 0.944459 | -0.082333 | 0.082333 | 0.190441 | 0.154157 | 0.847734 | -0.052024 | 0.483999 |
| Requested | 0.944459 | 1.000000 | 0.119310 | -0.119310 | 0.182599 | 0.191184 | 0.796730 | -0.042510 | 0.465392 |
| Difference amount | -0.082333 | 0.119310 | 1.000000 | -1.000000 | -0.012559 | 0.137786 | -0.088321 | -0.014070 | -0.017926 |
| Covered amount | 0.082333 | -0.119310 | -1.000000 | 1.000000 | 0.012559 | -0.137786 | 0.088321 | 0.014070 | 0.017926 |
| Funded allocation | 0.190441 | 0.182599 | -0.012559 | 0.012559 | 1.000000 | 0.054697 | 0.183461 | -0.080867 | 0.064349 |
| Term in months | 0.154157 | 0.191184 | 0.137786 | -0.137786 | 0.054697 | 1.000000 | 0.233675 | 0.244807 | -0.146500 |
| Lenders | 0.847734 | 0.796730 | -0.088321 | 0.088321 | 0.183461 | 0.233675 | 1.000000 | -0.048289 | 0.291680 |
| Population | -0.052024 | -0.042510 | -0.014070 | 0.014070 | -0.080867 | 0.244807 | -0.048289 | 1.000000 | -0.036753 |
| Borrowers | 0.483999 | 0.465392 | -0.017926 | 0.017926 | 0.064349 | -0.146500 | 0.291680 | -0.036753 | 1.000000 |
corr.columns
Index(['Funded', 'Requested', 'Difference amount', 'Covered amount',
'Funded allocation', 'Term in months', 'Lenders', 'Population',
'Borrowers'],
dtype='object')
fig = px.imshow(corr, aspect='auto',
color_continuous_scale='Plasma', text_auto=True)
fig.update_traces(text=corr,
texttemplate="%{text:.2f}",
textfont={'size': 14}
)
fig.update_xaxes(# side="top",
tickfont=dict(size=14),
tickmode='array',
tickvals=corr.columns,
ticktext=corr.columns
)
fig.update_yaxes(tickmode='array',
ticktext=corr.columns,
tickvals=corr.columns,
tickfont=dict(size=14)
)
fig.update_layout(title={'text': "Overview of correlations",
'font_size': 16,
'y': 0.95,
'x': 0.55,
'xanchor': 'center',
'yanchor': 'top'},
autosize=False,
width=800,
height=600,
coloraxis=dict(colorbar=dict(title="Significance"))
)
fig.show()
pop_density_sel=pd.read_pickle('output/population.pkl')
Source for creation of the data base population: https://population.un.org/wpp/Download/Standard/CSV/
pop_density_sel.dtypes
Country object PopTotal float64 dtype: object
pop_density_sel
| Country | PopTotal | |
|---|---|---|
| 0 | Afghanistan | 40754.385 |
| 1 | Africa | 1406728.760 |
| 2 | African Group | 1404902.167 |
| 3 | African Union | 1405528.335 |
| 4 | African Union: Central Africa | 168061.108 |
| ... | ... | ... |
| 469 | World | 7953952.577 |
| 470 | World Bank Regional Groups (developing only) | 6679106.813 |
| 471 | Yemen | 31154.866 |
| 472 | Zambia | 19470.238 |
| 473 | Zimbabwe | 15331.426 |
474 rows × 2 columns
pop_density_all = pd.read_csv('input/WPP2019_TotalPopulationBySex.csv')
pop_density_all
| LocID | Location | VarID | Variant | Time | MidPeriod | PopMale | PopFemale | PopTotal | PopDensity | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4 | Afghanistan | 2 | Medium | 1950 | 1950.5 | 4099.243 | 3652.874 | 7752.117 | 11.874 |
| 1 | 4 | Afghanistan | 2 | Medium | 1951 | 1951.5 | 4134.756 | 3705.395 | 7840.151 | 12.009 |
| 2 | 4 | Afghanistan | 2 | Medium | 1952 | 1952.5 | 4174.450 | 3761.546 | 7935.996 | 12.156 |
| 3 | 4 | Afghanistan | 2 | Medium | 1953 | 1953.5 | 4218.336 | 3821.348 | 8039.684 | 12.315 |
| 4 | 4 | Afghanistan | 2 | Medium | 1954 | 1954.5 | 4266.484 | 3884.832 | 8151.316 | 12.486 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 280927 | 716 | Zimbabwe | 207 | Lower 95 PI | 2080 | 2080.5 | 10576.533 | 11255.983 | 21836.893 | 56.448 |
| 280928 | 716 | Zimbabwe | 207 | Lower 95 PI | 2085 | 2085.5 | 10293.349 | 11050.875 | 21355.988 | 55.205 |
| 280929 | 716 | Zimbabwe | 207 | Lower 95 PI | 2090 | 2090.5 | 9920.336 | 10767.709 | 20689.956 | 53.483 |
| 280930 | 716 | Zimbabwe | 207 | Lower 95 PI | 2095 | 2095.5 | 9503.711 | 10412.184 | 19892.080 | 51.421 |
| 280931 | 716 | Zimbabwe | 207 | Lower 95 PI | 2100 | 2100.5 | 9090.075 | 9996.105 | 19061.177 | 49.273 |
280932 rows × 10 columns
Source of the .csv file: https://population.un.org/wpp/Download/Standard/CSV/
pop_density = pop_density_all.loc[pop_density_all.loc[:,'Time'] == 2022, ['Location', 'Time', 'PopDensity']]
pop_density
| Location | Time | PopDensity | |
|---|---|---|---|
| 72 | Afghanistan | 2022 | 62.424 |
| 153 | Afghanistan | 2022 | 62.610 |
| 234 | Afghanistan | 2022 | 62.295 |
| 315 | Afghanistan | 2022 | 62.872 |
| 396 | Afghanistan | 2022 | 61.211 |
| ... | ... | ... | ... |
| 280444 | Zimbabwe | 2022 | 39.056 |
| 280525 | Zimbabwe | 2022 | 40.046 |
| 280606 | Zimbabwe | 2022 | 39.584 |
| 280687 | Zimbabwe | 2022 | 39.719 |
| 280768 | Zimbabwe | 2022 | 39.335 |
2757 rows × 3 columns
pop_density_drop = pop_density.drop_duplicates(subset=['Location'], ignore_index=True)
pop_density_drop.reset_index()
pop_density_drop
| Location | Time | PopDensity | |
|---|---|---|---|
| 0 | Afghanistan | 2022 | 62.424 |
| 1 | Africa | 2022 | 47.447 |
| 2 | African Group | 2022 | 47.820 |
| 3 | African Union | 2022 | 47.412 |
| 4 | African Union: Central Africa | 2022 | 31.855 |
| ... | ... | ... | ... |
| 469 | World | 2022 | 61.140 |
| 470 | World Bank Regional Groups (developing only) | 2022 | 72.725 |
| 471 | Yemen | 2022 | 59.009 |
| 472 | Zambia | 2022 | 26.191 |
| 473 | Zimbabwe | 2022 | 39.631 |
474 rows × 3 columns
pop_sel = pop_density_drop[['Location', 'PopDensity']]
pop_sel
| Location | PopDensity | |
|---|---|---|
| 0 | Afghanistan | 62.424 |
| 1 | Africa | 47.447 |
| 2 | African Group | 47.820 |
| 3 | African Union | 47.412 |
| 4 | African Union: Central Africa | 31.855 |
| ... | ... | ... |
| 469 | World | 61.140 |
| 470 | World Bank Regional Groups (developing only) | 72.725 |
| 471 | Yemen | 59.009 |
| 472 | Zambia | 26.191 |
| 473 | Zimbabwe | 39.631 |
474 rows × 2 columns
pop_sel_2 = pop_sel.copy()
pop_sel_2.rename(mapper={'Location':'Country'}, axis = 1, inplace = True)
pop_sel_2.columns
Index(['Country', 'PopDensity'], dtype='object')
pop_sel_2.set_index('Country')
| PopDensity | |
|---|---|
| Country | |
| Afghanistan | 62.424 |
| Africa | 47.447 |
| African Group | 47.820 |
| African Union | 47.412 |
| African Union: Central Africa | 31.855 |
| ... | ... |
| World | 61.140 |
| World Bank Regional Groups (developing only) | 72.725 |
| Yemen | 59.009 |
| Zambia | 26.191 |
| Zimbabwe | 39.631 |
474 rows × 1 columns
data_population = pop_sel_2.to_pickle('output/population.pkl')
loan_funded = df.groupby(['Country']).agg({'Country': 'size','Requested': sum, 'Funded': sum})
loan_funded
| Country | Requested | Funded | |
|---|---|---|---|
| Country | |||
| Afghanistan | 2 | 14000 | 14000 |
| Albania | 1934 | 2666500 | 2490000 |
| Armenia | 8629 | 12913075 | 11184350 |
| Azerbaijan | 1942 | 2886350 | 2697225 |
| Belize | 124 | 113500 | 113500 |
| ... | ... | ... | ... |
| Vietnam | 9534 | 14154350 | 12928975 |
| Virgin Islands | 2 | 10000 | 0 |
| Yemen | 2313 | 1879950 | 1784075 |
| Zambia | 775 | 1205075 | 1142350 |
| Zimbabwe | 3973 | 3415900 | 3341225 |
86 rows × 3 columns
loan_funded.rename(mapper={'Country':'Nr of projects'}, axis = 1, inplace = True)
loan_funded.columns
Index(['Nr of projects', 'Requested', 'Funded'], dtype='object')
loan_funded
| Nr of projects | Requested | Funded | |
|---|---|---|---|
| Country | |||
| Afghanistan | 2 | 14000 | 14000 |
| Albania | 1934 | 2666500 | 2490000 |
| Armenia | 8629 | 12913075 | 11184350 |
| Azerbaijan | 1942 | 2886350 | 2697225 |
| Belize | 124 | 113500 | 113500 |
| ... | ... | ... | ... |
| Vietnam | 9534 | 14154350 | 12928975 |
| Virgin Islands | 2 | 10000 | 0 |
| Yemen | 2313 | 1879950 | 1784075 |
| Zambia | 775 | 1205075 | 1142350 |
| Zimbabwe | 3973 | 3415900 | 3341225 |
86 rows × 3 columns
loan_funded_pop_dens = loan_funded.merge(pop_sel_2, on='Country', how='left')
loan_funded_pop_dens
| Country | Nr of projects | Requested | Funded | PopDensity | |
|---|---|---|---|---|---|
| 0 | Afghanistan | 2 | 14000 | 14000 | 62.424 |
| 1 | Albania | 1934 | 2666500 | 2490000 | 104.612 |
| 2 | Armenia | 8629 | 12913075 | 11184350 | 104.389 |
| 3 | Azerbaijan | 1942 | 2886350 | 2697225 | 124.612 |
| 4 | Belize | 124 | 113500 | 113500 | 18.071 |
| ... | ... | ... | ... | ... | ... |
| 81 | Vietnam | 9534 | 14154350 | 12928975 | NaN |
| 82 | Virgin Islands | 2 | 10000 | 0 | NaN |
| 83 | Yemen | 2313 | 1879950 | 1784075 | 59.009 |
| 84 | Zambia | 775 | 1205075 | 1142350 | 26.191 |
| 85 | Zimbabwe | 3973 | 3415900 | 3341225 | 39.631 |
86 rows × 5 columns
nan_sum = loan_funded_pop_dens.isna().sum()
nan_sum[nan_sum > 0].sort_values(ascending=False)
PopDensity 10 dtype: int64
m_v = loan_funded_pop_dens.loc[loan_funded_pop_dens.loc[:,'PopDensity'].isnull()]
m_v
| Country | Nr of projects | Requested | Funded | PopDensity | |
|---|---|---|---|---|---|
| 7 | Bolivia | 8799 | 19837925 | 18275575 | NaN |
| 18 | Cote D'Ivoire | 1 | 50000 | 50000 | NaN |
| 35 | Kosovo | 1419 | 1947750 | 1778600 | NaN |
| 46 | Moldova | 348 | 718725 | 686850 | NaN |
| 49 | Myanmar (Burma) | 1870 | 3242300 | 3035850 | NaN |
| 55 | Palestine | 8160 | 13063325 | 12024450 | NaN |
| 72 | Tanzania | 5214 | 6826200 | 6516550 | NaN |
| 79 | United States | 6092 | 31495375 | 23158540 | NaN |
| 81 | Vietnam | 9534 | 14154350 | 12928975 | NaN |
| 82 | Virgin Islands | 2 | 10000 | 0 | NaN |
countries = m_v['Country']
densities = [11.102, 87.24, 162.5, 122.167, 84.537, 887.964, 11.027, 16.977, 36.601, 319.133, 204.04]
combi = dict(zip(countries, densities))
for k,v in combi.items():
loan_funded_pop_dens.loc[(loan_funded_pop_dens['Country'] == k) & (loan_funded_pop_dens['PopDensity'].isnull()), 'PopDensity'] = v
loan_funded_pop_dens.isnull().sum()
Country 0 Nr of projects 0 Requested 0 Funded 0 PopDensity 0 dtype: int64
loan_funded_pop_dens.loc[loan_funded_pop_dens['Country'] == 'Bolivia']
| Country | Nr of projects | Requested | Funded | PopDensity | |
|---|---|---|---|---|---|
| 7 | Bolivia | 8799 | 19837925 | 18275575 | 11.102 |
loan_funded_pop_dens.rename(mapper={'PopDensity':'Population density'}, axis = 1, inplace = True)
loan_funded_pop_dens.columns
Index(['Country', 'Nr of projects', 'Requested', 'Funded',
'Population density'],
dtype='object')
loan_funded_pop_dens['Covered amount'] = (loan_funded_pop_dens['Funded']/loan_funded_pop_dens['Requested'])*100
loan_funded_pop_dens
| Country | Nr of projects | Requested | Funded | Population density | Covered amount | |
|---|---|---|---|---|---|---|
| 0 | Afghanistan | 2 | 14000 | 14000 | 62.424 | 100.000000 |
| 1 | Albania | 1934 | 2666500 | 2490000 | 104.612 | 93.380836 |
| 2 | Armenia | 8629 | 12913075 | 11184350 | 104.389 | 86.612600 |
| 3 | Azerbaijan | 1942 | 2886350 | 2697225 | 124.612 | 93.447607 |
| 4 | Belize | 124 | 113500 | 113500 | 18.071 | 100.000000 |
| ... | ... | ... | ... | ... | ... | ... |
| 81 | Vietnam | 9534 | 14154350 | 12928975 | 36.601 | 91.342767 |
| 82 | Virgin Islands | 2 | 10000 | 0 | 319.133 | 0.000000 |
| 83 | Yemen | 2313 | 1879950 | 1784075 | 59.009 | 94.900130 |
| 84 | Zambia | 775 | 1205075 | 1142350 | 26.191 | 94.794930 |
| 85 | Zimbabwe | 3973 | 3415900 | 3341225 | 39.631 | 97.813900 |
86 rows × 6 columns
loan_funded_pop_dens['Project amounts'] = (loan_funded_pop_dens['Nr of projects']/loan_funded_pop_dens['Population density'])*100
loan_funded_pop_dens
| Country | Nr of projects | Requested | Funded | Population density | Covered amount | Project amounts | |
|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | 2 | 14000 | 14000 | 62.424 | 100.000000 | 3.203896 |
| 1 | Albania | 1934 | 2666500 | 2490000 | 104.612 | 93.380836 | 1848.736283 |
| 2 | Armenia | 8629 | 12913075 | 11184350 | 104.389 | 86.612600 | 8266.196630 |
| 3 | Azerbaijan | 1942 | 2886350 | 2697225 | 124.612 | 93.447607 | 1558.437390 |
| 4 | Belize | 124 | 113500 | 113500 | 18.071 | 100.000000 | 686.182281 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 81 | Vietnam | 9534 | 14154350 | 12928975 | 36.601 | 91.342767 | 26048.468621 |
| 82 | Virgin Islands | 2 | 10000 | 0 | 319.133 | 0.000000 | 0.626698 |
| 83 | Yemen | 2313 | 1879950 | 1784075 | 59.009 | 94.900130 | 3919.741056 |
| 84 | Zambia | 775 | 1205075 | 1142350 | 26.191 | 94.794930 | 2959.031728 |
| 85 | Zimbabwe | 3973 | 3415900 | 3341225 | 39.631 | 97.813900 | 10024.980445 |
86 rows × 7 columns
loan_funded_pop_dens['Funded allocation'] = (loan_funded_pop_dens['Funded'] / loan_funded_pop_dens['Population density'])*100
loan_funded_pop_dens
| Country | Nr of projects | Requested | Funded | Population density | Covered amount | Project amounts | Funded allocation | |
|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | 2 | 14000 | 14000 | 62.424 | 100.000000 | 3.203896 | 2.242727e+04 |
| 1 | Albania | 1934 | 2666500 | 2490000 | 104.612 | 93.380836 | 1848.736283 | 2.380224e+06 |
| 2 | Armenia | 8629 | 12913075 | 11184350 | 104.389 | 86.612600 | 8266.196630 | 1.071411e+07 |
| 3 | Azerbaijan | 1942 | 2886350 | 2697225 | 124.612 | 93.447607 | 1558.437390 | 2.164499e+06 |
| 4 | Belize | 124 | 113500 | 113500 | 18.071 | 100.000000 | 686.182281 | 6.280781e+05 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 81 | Vietnam | 9534 | 14154350 | 12928975 | 36.601 | 91.342767 | 26048.468621 | 3.532410e+07 |
| 82 | Virgin Islands | 2 | 10000 | 0 | 319.133 | 0.000000 | 0.626698 | 0.000000e+00 |
| 83 | Yemen | 2313 | 1879950 | 1784075 | 59.009 | 94.900130 | 3919.741056 | 3.023395e+06 |
| 84 | Zambia | 775 | 1205075 | 1142350 | 26.191 | 94.794930 | 2959.031728 | 4.361613e+06 |
| 85 | Zimbabwe | 3973 | 3415900 | 3341225 | 39.631 | 97.813900 | 10024.980445 | 8.430837e+06 |
86 rows × 8 columns
Geomap
fig = px.scatter_geo(loan_funded_pop_dens, locations='Country', color='Funded', locationmode='country names', size='Funded allocation') # PopDensity
fig.show()
fig = px.scatter_geo(loan_funded_pop_dens, locations='Country', color='Nr of projects', locationmode='country names', size='Project amounts')
fig.show()
corr = loan_funded_pop_dens.corr(method='spearman')
corr
| Nr of projects | Requested | Funded | Population density | Covered amount | Project amounts | Funded allocation | |
|---|---|---|---|---|---|---|---|
| Nr of projects | 1.000000 | 0.937674 | 0.937863 | 0.144145 | -0.432759 | 0.848629 | 0.774000 |
| Requested | 0.937674 | 1.000000 | 0.999321 | 0.116034 | -0.441677 | 0.817048 | 0.833690 |
| Funded | 0.937863 | 0.999321 | 1.000000 | 0.113629 | -0.424318 | 0.817520 | 0.834407 |
| Population density | 0.144145 | 0.116034 | 0.113629 | 1.000000 | -0.185919 | -0.307657 | -0.371283 |
| Covered amount | -0.432759 | -0.441677 | -0.424318 | -0.185919 | 1.000000 | -0.326698 | -0.313282 |
| Project amounts | 0.848629 | 0.817048 | 0.817520 | -0.307657 | -0.326698 | 1.000000 | 0.949262 |
| Funded allocation | 0.774000 | 0.833690 | 0.834407 | -0.371283 | -0.313282 | 0.949262 | 1.000000 |
loan_funded = df.groupby(['Country', 'Credit types']).agg({'Country': 'size','Covered amount': sum, 'Funded allocation': sum})
loan_funded
| Country | Covered amount | Funded allocation | ||
|---|---|---|---|---|
| Country | Credit types | |||
| Afghanistan | bronze | 2 | 200.00000 | 34.352132 |
| silver | 0 | 0.00000 | 0.000000 | |
| gold | 0 | 0.00000 | 0.000000 | |
| platinum | 0 | 0.00000 | 0.000000 | |
| Albania | bronze | 1933 | 182550.71875 | 85124.850944 |
| ... | ... | ... | ... | ... |
| Zambia | platinum | 0 | 0.00000 | 0.000000 |
| Zimbabwe | bronze | 3972 | 392087.71875 | 21467.181200 |
| silver | 1 | 100.00000 | 326.127524 | |
| gold | 0 | 0.00000 | 0.000000 | |
| platinum | 0 | 0.00000 | 0.000000 |
344 rows × 3 columns
loan_funded = df.groupby(['Country', 'Credit types']).agg({'Country': 'size','Requested': sum, 'Funded': sum})
loan_funded
| Country | Requested | Funded | ||
|---|---|---|---|---|
| Country | Credit types | |||
| Afghanistan | bronze | 2 | 14000 | 14000 |
| silver | 0 | 0 | 0 | |
| gold | 0 | 0 | 0 | |
| platinum | 0 | 0 | 0 | |
| Albania | bronze | 1933 | 2616500 | 2440000 |
| ... | ... | ... | ... | ... |
| Zambia | platinum | 0 | 0 | 0 |
| Zimbabwe | bronze | 3972 | 3365900 | 3291225 |
| silver | 1 | 50000 | 50000 | |
| gold | 0 | 0 | 0 | |
| platinum | 0 | 0 | 0 |
344 rows × 3 columns
loan_funded.rename(mapper={'Country':'Nr of projects'}, axis = 1, inplace = True)
loan_funded.columns
Index(['Nr of projects', 'Requested', 'Funded'], dtype='object')
loan_funded_2=loan_funded.reset_index()
loan_funded_2
| Country | Credit types | Nr of projects | Requested | Funded | |
|---|---|---|---|---|---|
| 0 | Afghanistan | bronze | 2 | 14000 | 14000 |
| 1 | Afghanistan | silver | 0 | 0 | 0 |
| 2 | Afghanistan | gold | 0 | 0 | 0 |
| 3 | Afghanistan | platinum | 0 | 0 | 0 |
| 4 | Albania | bronze | 1933 | 2616500 | 2440000 |
| ... | ... | ... | ... | ... | ... |
| 339 | Zambia | platinum | 0 | 0 | 0 |
| 340 | Zimbabwe | bronze | 3972 | 3365900 | 3291225 |
| 341 | Zimbabwe | silver | 1 | 50000 | 50000 |
| 342 | Zimbabwe | gold | 0 | 0 | 0 |
| 343 | Zimbabwe | platinum | 0 | 0 | 0 |
344 rows × 5 columns
pop_density_sel=pd.read_pickle('output/population.pkl')
loan_funded = loan_funded_2.merge(pop_density_sel, on='Country', how='left')
loan_funded
| Country | Credit types | Nr of projects | Requested | Funded | PopDensity | |
|---|---|---|---|---|---|---|
| 0 | Afghanistan | bronze | 2 | 14000 | 14000 | 62.424 |
| 1 | Afghanistan | silver | 0 | 0 | 0 | 62.424 |
| 2 | Afghanistan | gold | 0 | 0 | 0 | 62.424 |
| 3 | Afghanistan | platinum | 0 | 0 | 0 | 62.424 |
| 4 | Albania | bronze | 1933 | 2616500 | 2440000 | 104.612 |
| ... | ... | ... | ... | ... | ... | ... |
| 339 | Zambia | platinum | 0 | 0 | 0 | 26.191 |
| 340 | Zimbabwe | bronze | 3972 | 3365900 | 3291225 | 39.631 |
| 341 | Zimbabwe | silver | 1 | 50000 | 50000 | 39.631 |
| 342 | Zimbabwe | gold | 0 | 0 | 0 | 39.631 |
| 343 | Zimbabwe | platinum | 0 | 0 | 0 | 39.631 |
344 rows × 6 columns
countries = m_v['Country']
densities = [11.102, 87.24, 162.5, 122.167, 84.537, 887.964, 11.027, 16.977, 36.601, 319.133, 204.04]
combi = dict(zip(countries, densities))
for k,v in combi.items():
loan_funded_pop_dens.loc[(loan_funded['Country'] == k) & (loan_funded['PopDensity'].isnull()), 'PopDensity'] = v
loan_funded['Requested'].sum()
560312125
loan_funded['Funded'].sum()
522540915
loan_funded['Funded allocation'] = (loan_funded['Funded'] / loan_funded['PopDensity'])*100
loan_funded
| Country | Credit types | Nr of projects | Requested | Funded | PopDensity | Funded allocation | |
|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | bronze | 2 | 14000 | 14000 | 62.424 | 2.242727e+04 |
| 1 | Afghanistan | silver | 0 | 0 | 0 | 62.424 | 0.000000e+00 |
| 2 | Afghanistan | gold | 0 | 0 | 0 | 62.424 | 0.000000e+00 |
| 3 | Afghanistan | platinum | 0 | 0 | 0 | 62.424 | 0.000000e+00 |
| 4 | Albania | bronze | 1933 | 2616500 | 2440000 | 104.612 | 2.332428e+06 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 339 | Zambia | platinum | 0 | 0 | 0 | 26.191 | 0.000000e+00 |
| 340 | Zimbabwe | bronze | 3972 | 3365900 | 3291225 | 39.631 | 8.304673e+06 |
| 341 | Zimbabwe | silver | 1 | 50000 | 50000 | 39.631 | 1.261639e+05 |
| 342 | Zimbabwe | gold | 0 | 0 | 0 | 39.631 | 0.000000e+00 |
| 343 | Zimbabwe | platinum | 0 | 0 | 0 | 39.631 | 0.000000e+00 |
344 rows × 7 columns
loan_funded['Project allocation'] = (loan_funded['Nr of projects'] / loan_funded['PopDensity'])*100
loan_funded
| Country | Credit types | Nr of projects | Requested | Funded | PopDensity | Funded allocation | Project allocation | |
|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | bronze | 2 | 14000 | 14000 | 62.424 | 2.242727e+04 | 3.203896 |
| 1 | Afghanistan | silver | 0 | 0 | 0 | 62.424 | 0.000000e+00 | 0.000000 |
| 2 | Afghanistan | gold | 0 | 0 | 0 | 62.424 | 0.000000e+00 | 0.000000 |
| 3 | Afghanistan | platinum | 0 | 0 | 0 | 62.424 | 0.000000e+00 | 0.000000 |
| 4 | Albania | bronze | 1933 | 2616500 | 2440000 | 104.612 | 2.332428e+06 | 1847.780369 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 339 | Zambia | platinum | 0 | 0 | 0 | 26.191 | 0.000000e+00 | 0.000000 |
| 340 | Zimbabwe | bronze | 3972 | 3365900 | 3291225 | 39.631 | 8.304673e+06 | 10022.457167 |
| 341 | Zimbabwe | silver | 1 | 50000 | 50000 | 39.631 | 1.261639e+05 | 2.523277 |
| 342 | Zimbabwe | gold | 0 | 0 | 0 | 39.631 | 0.000000e+00 | 0.000000 |
| 343 | Zimbabwe | platinum | 0 | 0 | 0 | 39.631 | 0.000000e+00 | 0.000000 |
344 rows × 8 columns
loan_funded['Covered amount'] = (loan_funded['Funded'] / loan_funded['Requested'])*100
loan_funded
| Country | Credit types | Nr of projects | Requested | Funded | PopDensity | Funded allocation | Project allocation | Covered amount | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | bronze | 2 | 14000 | 14000 | 62.424 | 2.242727e+04 | 3.203896 | 100.000000 |
| 1 | Afghanistan | silver | 0 | 0 | 0 | 62.424 | 0.000000e+00 | 0.000000 | NaN |
| 2 | Afghanistan | gold | 0 | 0 | 0 | 62.424 | 0.000000e+00 | 0.000000 | NaN |
| 3 | Afghanistan | platinum | 0 | 0 | 0 | 62.424 | 0.000000e+00 | 0.000000 | NaN |
| 4 | Albania | bronze | 1933 | 2616500 | 2440000 | 104.612 | 2.332428e+06 | 1847.780369 | 93.254347 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 339 | Zambia | platinum | 0 | 0 | 0 | 26.191 | 0.000000e+00 | 0.000000 | NaN |
| 340 | Zimbabwe | bronze | 3972 | 3365900 | 3291225 | 39.631 | 8.304673e+06 | 10022.457167 | 97.781425 |
| 341 | Zimbabwe | silver | 1 | 50000 | 50000 | 39.631 | 1.261639e+05 | 2.523277 | 100.000000 |
| 342 | Zimbabwe | gold | 0 | 0 | 0 | 39.631 | 0.000000e+00 | 0.000000 | NaN |
| 343 | Zimbabwe | platinum | 0 | 0 | 0 | 39.631 | 0.000000e+00 | 0.000000 | NaN |
344 rows × 9 columns
my_view = px.scatter(data_frame=loan_funded,
x='Requested',
y='Funded',
color='Credit types',
size='Nr of projects',
# color_discrete_map={'bronze':'blue','silver':'red','gold':'orange','platinum':'green'}
)
my_view.update_layout(title={'text':'Credit types',
'y':0.93,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'})
my_view.show()
my_view = px.scatter(data_frame=loan_funded,
x='Covered amount',
y='Funded allocation',
color='Credit types',
size='Nr of projects',
color_discrete_map={'bronze':'blue','silver':'red','gold':'orange','platinum':'green'})
my_view.update_layout(title={'text':'Credit types in raport to population density',
'y':0.93,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'})
my_view.show()
sector_order = df.groupby('Sector', as_index = False).agg(Sectors=('Sector','size')).sort_values(by='Sectors', ascending=False)
sector_order
| Sector | Sectors | |
|---|---|---|
| 0 | Agriculture | 174624 |
| 6 | Food | 135359 |
| 11 | Retail | 123186 |
| 12 | Services | 44844 |
| 2 | Clothing | 32554 |
| 8 | Housing | 30155 |
| 4 | Education | 30128 |
| 10 | Personal Use | 25533 |
| 13 | Transportation | 15392 |
| 1 | Arts | 11938 |
| 7 | Health | 9190 |
| 3 | Construction | 6263 |
| 9 | Manufacturing | 6204 |
| 5 | Entertainment | 829 |
| 14 | Wholesale | 634 |
fig = px.bar(sector_order, y='Sectors', x='Sector', text_auto='.2s')
fig.update_layout(title={'text': 'Types of sectors of interest',
'y':0.93,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'})
fig.update_yaxes(title_text = 'Number of projects\' topics')
fig.show()
# conda install -c conda-forge wordcloud
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import codecs
nltk.download('punkt')
nltk.download('stopwords')
[nltk_data] Downloading package punkt to [nltk_data] /Users/gabrielarotari/nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package stopwords to [nltk_data] /Users/gabrielarotari/nltk_data... [nltk_data] Package stopwords is already up-to-date!
True
df_w = df.copy()
df_w_agr= df_w.loc[df_w['Sector'] == 'Agriculture', 'Use'].astype('str')
df_w_agr
6 To purchase a dairy cow and start a milk produ...
14 to expand her existing poultry farm business.
25 to purchase potato seeds and fertilizers for g...
28 to purchase potato seeds and fertilizers for g...
29 to purchase potato seeds and fertilizer for fa...
...
646828 [True, u'para compara: cemento, arenya y ladri...
646829 Reviewed loan use in english.
646830 Pretend the flagged issue was addressed by KC.
646831 Translated loan use to english.
646832 Reviewed loan use in english.
Name: Use, Length: 174624, dtype: object
df_w_agr.isnull().sum()
0
joined_rows = ' '.join(df_w_agr)
#joined_rows
for char in '.:,)!-[(]2':
if char in joined_rows:
joined_rows = joined_rows.replace(char, ' ')
words = joined_rows.split(' ')
#words
words_final = []
for word in words:
if word != "":
words_final.append(word)
import spacy
en = spacy.load('en_core_web_sm')
stopwords = en.Defaults.stop_words
cleaning = []
for token in words_final:
if token not in stopwords:
cleaning.append(token)
from collections import Counter
dict_count = Counter(cleaning)
# dict_count
dict_count.most_common(5)
[('buy', 108464),
('purchase', 47424),
('fertilizer', 33566),
('seeds', 30087),
('farm', 24925)]
my_wordcloud_final = WordCloud(width=800, height=800,
background_color='white',
max_words=300,
# stopwords=STOP_WORDS,
min_font_size=12).generate_from_frequencies(dict_count)
plt.figure(figsize=(10, 10), facecolor=None)
plt.imshow(my_wordcloud_final, interpolation="bilinear") # image displayed more smoothly
plt.axis('off')
plt.tight_layout(pad=10)
plt.show()